{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [],
   "source": [
    "# This Python 3 environment comes with many helpful analytics libraries installed\n",
    "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n",
    "# For example, here's several helpful packages to load in \n",
    "\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "\n",
    "# Input data files are available in the \"../input/\" directory.\n",
    "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
    "\n",
    "import os\n",
    "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
    "    for filename in filenames:\n",
    "        print(os.path.join(dirname, filename))\n",
    "\n",
    "# Any results you write to the current directory are saved as output."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install algo-timer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import lib\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import xgboost as xgb\n",
    "from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve, auc\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import balanced_accuracy_score\n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn import svm\n",
    "from imblearn.ensemble import EasyEnsembleClassifier, BalancedBaggingClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "mpl.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from xgboost import plot_importance\n",
    "from sklearn.metrics import make_scorer\n",
    "import time\n",
    "from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from algotimer import Timer, TimerPloter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "",
    "_uuid": ""
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "testX = pd.read_csv(\"../input/testX.csv\")\n",
    "X = pd.read_csv(\"../input/trainX.csv\")\n",
    "y = pd.read_csv(\"../input/trainY.csv\", header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ids = testX.id\n",
    "testX = testX.drop(['id'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ROC\n",
    "def plotROC(y_test, y_score, pltName):\n",
    "    fpr = dict()\n",
    "    tpr = dict()\n",
    "    roc_auc = dict()\n",
    "    fpr[2], tpr[2], _ = roc_curve(y_test, y_score)\n",
    "    roc_auc[2] = auc(fpr[2], tpr[2])\n",
    "\n",
    "    # Compute micro-average ROC curve and ROC area\n",
    "    fpr[\"micro\"], tpr[\"micro\"], _ = roc_curve(y_test, y_score)\n",
    "    roc_auc[\"micro\"] = auc(fpr[\"micro\"], tpr[\"micro\"])\n",
    "    # plot it\n",
    "    plt.figure()\n",
    "    lw = 2\n",
    "    plt.plot(fpr[2], tpr[2], color='darkorange',\n",
    "             lw=lw, label='AUC=%0.2f' % roc_auc[2])\n",
    "    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')\n",
    "    plt.xlim([0.0, 1.0])\n",
    "    plt.ylim([0.0, 1.05])\n",
    "    plt.xlabel('FPR')\n",
    "    plt.ylabel('TPR')\n",
    "    plt.title('ROC of ' + pltName)\n",
    "    plt.legend(loc=\"best\")\n",
    "    plt.savefig(f'{pltName}.png', bbox_inches='tight', dpi=300)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, stratify=y, random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def objective(params):\n",
    "    time1 = time.time()\n",
    "    params = {\n",
    "        'max_depth': int(params['max_depth']),\n",
    "        'gamma': \"{:.3f}\".format(params['gamma']),\n",
    "        'subsample': \"{:.2f}\".format(params['subsample']),\n",
    "        'reg_alpha': \"{:.3f}\".format(params['reg_alpha']),\n",
    "        'reg_lambda': \"{:.3f}\".format(params['reg_lambda']),\n",
    "        'learning_rate': \"{:.3f}\".format(params['learning_rate']),\n",
    "        'num_leaves': '{:.3f}'.format(params['num_leaves']),\n",
    "        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),\n",
    "        'min_child_samples': '{:.3f}'.format(params['min_child_samples']),\n",
    "        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),\n",
    "        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])\n",
    "    }\n",
    "\n",
    "    print(\"\\n############## New Run ################\")\n",
    "    print(f\"params = {params}\")\n",
    "    FOLDS = 5\n",
    "    count = 1\n",
    "    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)\n",
    "    score_mean = 0\n",
    "    for tr_idx, val_idx in skf.split(X_train, y_train.values.ravel()):\n",
    "        clf = xgb.XGBClassifier(\n",
    "            objective=\"binary:logistic\",\n",
    "            n_estimators=300, random_state=4, verbose=True, \n",
    "            tree_method='hist', \n",
    "            scale_pos_weight=136,\n",
    "            n_jobs=-1,\n",
    "            **params\n",
    "        )\n",
    "\n",
    "        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]\n",
    "        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]\n",
    "        \n",
    "        clf.fit(X_tr, y_tr.values.ravel())\n",
    "        score = make_scorer(roc_auc_score, needs_proba=True)(clf, X_vl, y_vl)\n",
    "        score_mean += score\n",
    "        print(f'{count} CV - score: {round(score, 4)}')\n",
    "        count += 1\n",
    "    time2 = time.time() - time1\n",
    "    print(f\"Total Time Run: {round(time2 / 60,2)}\")\n",
    "    gc.collect()\n",
    "    print(f'Mean ROC_AUC: {score_mean / FOLDS}')\n",
    "    del X_tr, X_vl, y_tr, y_vl, clf, score\n",
    "    return -(score_mean / FOLDS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "space = {\n",
    "    # The maximum depth of a tree, same as GBM.\n",
    "    # Used to control over-fitting as higher depth will allow model \n",
    "    # to learn relations very specific to a particular sample.\n",
    "    # Should be tuned using CV.\n",
    "    # Typical values: 3-10\n",
    "    'max_depth': hp.quniform('max_depth', 2, 6, 1),\n",
    "    \n",
    "    # reg_alpha: L1 regularization term. L1 regularization encourages sparsity \n",
    "    # (meaning pulling weights to 0). It can be more useful when the objective\n",
    "    # is logistic regression since you might need help with feature selection.\n",
    "    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),\n",
    "    \n",
    "    # reg_lambda: L2 regularization term. L2 encourages smaller weights, this\n",
    "    # approach can be more useful in tree-models where zeroing \n",
    "    # features might not make much sense.\n",
    "    'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.4),\n",
    "    \n",
    "    # eta: Analogous to learning rate in GBM\n",
    "    # Makes the model more robust by shrinking the weights on each step\n",
    "    # Typical final values to be used: 0.01-0.2\n",
    "    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),\n",
    "    \n",
    "    # colsample_bytree: Similar to max_features in GBM. Denotes the \n",
    "    # fraction of columns to be randomly samples for each tree.\n",
    "    # Typical values: 0.5-1\n",
    "    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),\n",
    "    \n",
    "    # A node is split only when the resulting split gives a positive\n",
    "    # reduction in the loss function. Gamma specifies the \n",
    "    # minimum loss reduction required to make a split.\n",
    "    # Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.\n",
    "    'gamma': hp.uniform('gamma', 0.01, .7),\n",
    "    \n",
    "    # more increases accuracy, but may lead to overfitting.\n",
    "    # num_leaves: the number of leaf nodes to use. Having a large number \n",
    "    # of leaves will improve accuracy, but will also lead to overfitting.\n",
    "    'num_leaves': hp.choice('num_leaves', list(range(10, 100, 10))),\n",
    "    \n",
    "    # specifies the minimum samples per leaf node.\n",
    "    # the minimum number of samples (data) to group into a leaf. \n",
    "    # The parameter can greatly assist with overfitting: larger sample\n",
    "    # sizes per leaf will reduce overfitting (but may lead to under-fitting).\n",
    "    'min_child_samples': hp.choice('min_child_samples', list(range(10, 200, 20))),\n",
    "    \n",
    "    # subsample: represents a fraction of the rows (observations) to be \n",
    "    # considered when building each subtree. Tianqi Chen and Carlos Guestrin\n",
    "    # in their paper A Scalable Tree Boosting System recommend \n",
    "    'subsample': hp.choice('subsample', [0.7, 0.8, 0.9, 1]),\n",
    "    \n",
    "    # randomly select a fraction of the features.\n",
    "    # feature_fraction: controls the subsampling of features used\n",
    "    # for training (as opposed to subsampling the actual training data in \n",
    "    # the case of bagging). Smaller fractions reduce overfitting.\n",
    "    'feature_fraction': hp.uniform('feature_fraction', 0.7, 1),\n",
    "    \n",
    "    # randomly bag or subsample training data.\n",
    "    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1)\n",
    "    \n",
    "    # bagging_fraction and bagging_freq: enables bagging (subsampling) \n",
    "    # of the training data. Both values need to be set for bagging to be used.\n",
    "    # The frequency controls how often (iteration) bagging is used. Smaller\n",
    "    # fractions and frequencies reduce overfitting.\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set algoritm parameters\n",
    "with Timer('XGBoost, Search') as t:\n",
    "    best = fmin(fn=objective,\n",
    "                space=space,\n",
    "                algo=tpe.suggest,\n",
    "                max_evals=20)\n",
    "    best_params = space_eval(space, best)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Print best parameters\n",
    "best_params['max_depth'] = int(best_params['max_depth'])\n",
    "print(\"BEST PARAMS: \", best_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = xgb.XGBClassifier(\n",
    "    n_estimators=300,\n",
    "    **best_params,\n",
    "    tree_method='hist',\n",
    "    eval_metric=\"auc\",\n",
    "    n_jobs=-1,\n",
    "    scale_pos_weight=136\n",
    ")\n",
    "\n",
    "clf.fit(X_train, y_train.values.ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_y_train_pred = clf.predict_proba(X_train)[:,1]\n",
    "plotROC(y_train, xgb_y_train_pred, 'XGBoost-Train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_y_test_pred = clf.predict_proba(X_test)[:,1]\n",
    "plotROC(y_test, xgb_y_test_pred, 'XGBoost-Test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train with all data\n",
    "with Timer('XGBoost, Train') as t:\n",
    "    clf.fit(X, y.values.ravel(), verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train auc\n",
    "xgb_y_all_pred = clf.predict_proba(X)[:,1]\n",
    "roc_auc_score(y, xgb_y_all_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plotROC(y, xgb_y_all_pred, 'XGBoost-Train-AllData')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = pd.DataFrame()\n",
    "result['id'] = ids\n",
    "with Timer('XGBoost, Prediction') as t:\n",
    "    result['target'] = clf.predict_proba(testX)[:,1]\n",
    "result.to_csv('xgb.csv', index=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Easy Ensemble Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def objectiveEasy(params):\n",
    "    time1 = time.time()\n",
    "    params = {\n",
    "        'sampling_strategy': params['sampling_strategy'],\n",
    "    }\n",
    "\n",
    "    print(\"\\n############## New Run ################\")\n",
    "    print(f\"params = {params}\")\n",
    "    FOLDS = 5\n",
    "    count = 1\n",
    "    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)\n",
    "    score_mean = 0\n",
    "    for tr_idx, val_idx in skf.split(X_train, y_train.values.ravel()):\n",
    "        clf = EasyEnsembleClassifier(**params,\n",
    "                                    random_state=0,\n",
    "                                    n_estimators=300,\n",
    "                                    n_jobs=-1,\n",
    "                                    verbose=0)\n",
    "\n",
    "        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]\n",
    "        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]\n",
    "        \n",
    "        clf.fit(X_tr, y_tr.values.ravel())\n",
    "        score = make_scorer(roc_auc_score, needs_proba=True)(clf, X_vl, y_vl)\n",
    "        score_mean += score\n",
    "        print(f'{count} CV - score: {round(score, 4)}')\n",
    "        count += 1\n",
    "    time2 = time.time() - time1\n",
    "    print(f\"Total Time Run: {round(time2 / 60,2)}\")\n",
    "    gc.collect()\n",
    "    print(f'Mean ROC_AUC: {score_mean / FOLDS}')\n",
    "    del X_tr, X_vl, y_tr, y_vl, clf, score\n",
    "    return -(score_mean / FOLDS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spaceEasy = {\n",
    "    'sampling_strategy': hp.choice('sampling_strategy', [0.7, 0.8, 0.9, 1.0, 'auto'])\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set algoritm parameters\n",
    "with Timer('EasyEnsamble, Search') as t:\n",
    "    bestEasy = fmin(fn=objectiveEasy,\n",
    "                space=spaceEasy,\n",
    "                algo=tpe.suggest,\n",
    "                max_evals=5)\n",
    "\n",
    "    # Print best parameters\n",
    "    bestEasy_params = space_eval(spaceEasy, bestEasy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bestEasy_params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = EasyEnsembleClassifier(**bestEasy_params,\n",
    "                            random_state=0,\n",
    "                            n_estimators=300,\n",
    "                            n_jobs=-1,\n",
    "                            verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# training roc\n",
    "easy_y_train_pred = clf.predict_proba(X_train)[:,1]\n",
    "plotROC(y_train, easy_y_train_pred, 'EasyEnsamble-Train')\n",
    "# test roc\n",
    "easy_y_test_pred = clf.predict_proba(X_test)[:,1]\n",
    "plotROC(y_test, easy_y_test_pred, 'EasyEnsamble-Test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fit all data\n",
    "with Timer('EasyEnsamble, Train') as t:\n",
    "    clf.fit(X, y.values.ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "easy_y_all_pred = clf.predict_proba(X)[:, 1]\n",
    "plotROC(y, easy_y_all_pred, 'EasyEnsamble-Train-AllData')\n",
    "roc_auc_score(y, easy_y_all_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pridict\n",
    "result = pd.DataFrame()\n",
    "result['id'] = ids\n",
    "with Timer('EasyEnsamble, Prediction') as t:\n",
    "    result['target'] = clf.predict_proba(testX)[:, 1]\n",
    "result.to_csv('EasyEnsemble.csv', index=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Balanced Bagging Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def objectiveBalance(params):\n",
    "    time1 = time.time()\n",
    "    params = {\n",
    "        'sampling_strategy': params['sampling_strategy'],\n",
    "    }\n",
    "\n",
    "    print(\"\\n############## New Run ################\")\n",
    "    print(f\"params = {params}\")\n",
    "    FOLDS = 5\n",
    "    count = 1\n",
    "    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)\n",
    "    score_mean = 0\n",
    "    for tr_idx, val_idx in skf.split(X_train, y_train.values.ravel()):\n",
    "        clf = BalancedBaggingClassifier(**params,\n",
    "                                    random_state=0,\n",
    "                                    n_estimators=300,\n",
    "                                    n_jobs=-1,\n",
    "                                    verbose=0)\n",
    "\n",
    "        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]\n",
    "        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]\n",
    "        \n",
    "        clf.fit(X_tr, y_tr.values.ravel())\n",
    "        score = make_scorer(roc_auc_score, needs_proba=True)(clf, X_vl, y_vl)\n",
    "        score_mean += score\n",
    "        print(f'{count} CV - score: {round(score, 4)}')\n",
    "        count += 1\n",
    "    time2 = time.time() - time1\n",
    "    print(f\"Total Time Run: {round(time2 / 60,2)}\")\n",
    "    gc.collect()\n",
    "    print(f'Mean ROC_AUC: {score_mean / FOLDS}')\n",
    "    del X_tr, X_vl, y_tr, y_vl, clf, score\n",
    "    return -(score_mean / FOLDS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spaceBalance = spaceEasy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with Timer('BalancedEnsamble, Search') as t:\n",
    "    # Set algoritm parameters\n",
    "    bestBalance = fmin(fn=objectiveBalance,\n",
    "                space=spaceBalance,\n",
    "                algo=tpe.suggest,\n",
    "                max_evals=5)\n",
    "  \n",
    "    # Print best parameters\n",
    "    bestBalance_params = space_eval(spaceBalance, bestBalance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = BalancedBaggingClassifier(**bestBalance_params,\n",
    "                                random_state=0,\n",
    "                                n_estimators=300,\n",
    "                                n_jobs=-1,\n",
    "                                verbose=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# training roc\n",
    "balance_y_train_pred = clf.predict_proba(X_train)[:,1]\n",
    "plotROC(y_train, balance_y_train_pred, 'BalancedEnsamble-Train')\n",
    "# test roc\n",
    "balance_y_test_pred = clf.predict_proba(X_test)[:,1]\n",
    "plotROC(y_test, balance_y_test_pred, 'BalancedEnsamble-Test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with Timer('BalancedEnsamble, Train') as t:\n",
    "    # train with all data\n",
    "    clf.fit(X, y.values.ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "balance_y_all_pred = clf.predict_proba(X)[:, 1]\n",
    "plotROC(y, balance_y_all_pred, 'BalancedEnsamble-Train-AllData')\n",
    "roc_auc_score(y, balance_y_all_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = pd.DataFrame()\n",
    "result['id'] = ids\n",
    "with Timer('BalancedEnsamble, Prediction') as t:\n",
    "    result['target'] = clf.predict_proba(testX)[:, 1]\n",
    "result.to_csv('BalancedBaggingClassifier.csv', index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def subplotRoc(y_test, y_score, pltName):\n",
    "    fpr = dict()\n",
    "    tpr = dict()\n",
    "    roc_auc = dict()\n",
    "    fpr[2], tpr[2], _ = roc_curve(y_test, y_score)\n",
    "    roc_auc[2] = auc(fpr[2], tpr[2])\n",
    "\n",
    "    # Compute micro-average ROC curve and ROC area\n",
    "    fpr[\"micro\"], tpr[\"micro\"], _ = roc_curve(y_test, y_score)\n",
    "    roc_auc[\"micro\"] = auc(fpr[\"micro\"], tpr[\"micro\"])\n",
    "    # plot it\n",
    "    lw = 2\n",
    "    plt.plot(fpr[2], tpr[2], color='darkorange',\n",
    "             lw=lw, label='AUC=%0.2f' % roc_auc[2])\n",
    "    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')\n",
    "    plt.xlim([0.0, 1.0])\n",
    "    plt.ylim([0.0, 1.05])\n",
    "    plt.xlabel('FPR')\n",
    "    plt.ylabel('TPR')\n",
    "    plt.title('ROC of ' + pltName)\n",
    "    plt.legend(loc=\"best\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3 x 3 ROC plot\n",
    "fig = plt.subplots(3, 3, figsize=(16, 14))\n",
    "plt.subplots_adjust(hspace=0.6)\n",
    "plt.suptitle('ROC of All')\n",
    "plt.subplot(331)\n",
    "subplotRoc(y_train, xgb_y_train_pred, 'XGBoost-Train')\n",
    "plt.subplot(332)\n",
    "subplotRoc(y_test, xgb_y_test_pred, 'XGBoost-Test')\n",
    "plt.subplot(333)\n",
    "subplotRoc(y, xgb_y_all_pred, 'XGBoost-Train-All-Data')\n",
    "plt.subplot(334)\n",
    "subplotRoc(y_train, easy_y_train_pred, 'EasyEnsamble-Train')\n",
    "plt.subplot(335)\n",
    "subplotRoc(y_test, easy_y_test_pred, 'EasyEnsamble-Test')\n",
    "plt.subplot(336)\n",
    "subplotRoc(y, easy_y_all_pred, 'EasyEnsamble-Train-AllData')\n",
    "plt.subplot(337)\n",
    "subplotRoc(y_train, balance_y_train_pred, 'BalancedEnsamble-Train')\n",
    "plt.subplot(338)\n",
    "subplotRoc(y_test, balance_y_test_pred, 'BalancedEnsamble-Test')\n",
    "plt.subplot(339)\n",
    "subplotRoc(y, balance_y_all_pred, 'BalancedEnsamble-Train-AllData')\n",
    "\n",
    "plt.savefig('ROC.png', bbox_inches='tight', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
